In [1]:
import sys
import os

# Put the directory three levels above the current working directory on the
# import path (presumably so that the `qf` import below resolves — confirm).
parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))

# Append only once so re-running the cell does not grow sys.path.
if sys.path.count(parent_dir) == 0:
    sys.path.append(parent_dir)

import qf
import matplotlib.pyplot as plt
import numpy as np
/Users/PhilippSchmidt/miniforge3/envs/quantforce/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
In [2]:
# Ticker universe used by every download in this notebook.
# (A hand-written sample list used to be assigned here first, but it was
# overwritten on the very next line and therefore never had any effect.)
tickers = qf.DOWJONES

# Close prices over qf's default training window, requested with the
# "shrinkage" imputation method.
data = qf.get_data(
    tickers,
    start=qf.DEFAULT_TRAIN_START,
    end=qf.DEFAULT_TRAIN_END,
    indicators="Close",
    imputation_method="shrinkage",
)
data.head(10)
Out[2]:
Ticker DIS VZ HON IBM JNJ AMZN CSCO GS MRK SHW ... KO UNH NKE BA CVX MSFT NVDA PG WMT V
Price Close Close Close Close Close Close Close Close Close Close ... Close Close Close Close Close Close Close Close Close Close
2008-03-19 26.180058 13.515104 35.439728 63.005363 38.908901 3.5085 16.110493 124.557991 22.556087 14.047793 ... 17.578745 28.229650 12.310326 54.094528 41.544502 20.672213 0.404828 41.127773 11.780885 12.514247
2008-03-20 26.733154 13.895973 34.912403 63.754250 39.208771 3.6595 16.308006 134.388535 22.750242 14.461594 ... 17.895370 28.047014 13.393426 55.088791 42.214149 21.076694 0.424542 42.073952 12.349280 14.252951
2008-03-24 26.850483 14.222988 35.375420 64.147552 38.914898 3.7975 16.880793 133.827484 23.217197 14.206326 ... 17.936417 28.356695 13.753799 56.186142 42.620007 21.069475 0.459156 42.437859 12.442080 13.229659
2008-03-25 26.884001 14.196055 36.012058 63.560299 38.711002 3.7585 16.953213 134.388535 23.437567 14.039731 ... 18.003849 28.015253 13.612436 55.898911 42.888897 21.047800 0.465804 42.177055 12.307520 14.009307
2008-03-26 26.615831 13.903667 35.774132 62.989162 38.800953 3.6900 16.314592 131.283798 23.453304 13.695791 ... 17.927624 27.078217 13.194327 56.193508 43.101967 20.628864 0.450904 42.237698 12.272721 14.166567
2008-03-27 26.297379 13.834417 35.368988 62.240253 38.752972 3.5400 15.919568 125.792442 23.495295 13.617864 ... 17.974529 26.998817 13.136591 54.661640 42.817875 20.260490 0.444485 42.098217 12.149761 13.933999
2008-03-28 25.978930 13.792100 35.915592 61.728439 38.489109 3.4880 15.853723 123.031830 23.353615 13.569499 ... 17.866056 27.316463 13.112695 54.109261 42.868599 20.159376 0.451133 42.116405 12.091764 13.900782
2008-03-31 26.297379 14.022930 36.282135 62.035549 38.902912 3.5650 15.860313 123.735023 19.911697 13.714595 ... 17.845539 27.308506 13.538771 54.772110 43.304909 20.498854 0.453655 42.498512 12.221679 13.812185
2008-04-01 26.741537 14.606075 37.163162 62.762890 39.502621 3.8350 16.446262 132.316223 20.105825 14.609385 ... 18.012646 28.810631 13.851362 55.884174 44.005013 21.307823 0.481392 43.147472 12.546475 13.646064
2008-04-02 26.448229 14.671782 37.060257 61.857727 39.166775 3.8685 16.433098 132.323669 19.465721 14.851216 ... 17.725330 28.580132 13.532800 56.598572 44.395630 21.062243 0.466721 42.747181 12.634637 13.803320

10 rows × 24 columns

In [3]:
# Overview figure: all close-price columns on one shared axis, legend suppressed.
data.plot(
    title="Close Price of All Tickers",
    xlabel="Date",
    ylabel="Price",
    figsize=(20, 5),
    grid=True,
    legend=False,
)
Out[3]:
<Axes: title={'center': 'Close Price of All Tickers'}, xlabel='Date', ylabel='Price'>
No description has been provided for this image
In [4]:
# Log returns: log(1 + simple return), evaluated on the whole frame at once.
# pct_change() leaves the first row as NaN; dropna() removes every row that
# still contains a NaN.
log_returns = np.log(1 + data.pct_change()).dropna()
In [5]:
# Three 3-column grid figures for the shrinkage data: prices, log returns
# (y-axis limited to [-0.1, 0.1]) and log-return histograms with 50 bins.
qf.plot_grid(
    data,
    n_cols=3,
    figsize=(6, 2),
    x_name="Date",
    y_name="Price",
)
qf.plot_grid(
    log_returns,
    n_cols=3,
    figsize=(6, 2),
    ylim=(-0.1, 0.1),
    x_name="Date",
    y_name="log rate of return",
)
qf.plot_hist_grid(
    log_returns,
    n_cols=3,
    bins=50,
    figsize=(6, 2),
    log_y_scale=False,
    x_name="log rate of return",
    y_name="probability density",
)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [6]:
# Now we want to compare the statistics of the returns of the tickers if we use
# back-fill imputation ("bfill") instead of shrinkage for the same tickers and
# the same date range.
prices_bfill = qf.get_data(
    tickers,
    start=qf.DEFAULT_TRAIN_START,
    end=qf.DEFAULT_TRAIN_END,
    indicators="Close",  # spelled like the other get_data calls in this notebook
    imputation_method="bfill",
    n_trading_days=252,
)

# Simple returns and log returns (log(1 + simple return)) of the back-filled prices.
returns_bfill = prices_bfill.pct_change().dropna()
log_returns_bfill = np.log(1 + prices_bfill.pct_change()).dropna()

# Same three grid figures as for the shrinkage data, with matching axis labels
# so the two sets of figures can be read side by side.
qf.plot_grid(
    prices_bfill,
    n_cols=3,
    figsize=(6, 2),
    y_name="prices using Backfill",
    x_name="Date",
)

qf.plot_grid(
    log_returns_bfill,
    n_cols=3,
    ylim=(-0.1, 0.1),
    figsize=(6, 2),
    y_name="log rate of return",
    x_name="Date",
)

qf.plot_hist_grid(
    log_returns_bfill,
    n_cols=3,
    bins=50,
    figsize=(6, 2),
    log_y_scale=False,
    x_name="log rate of return",
    y_name="probability density",
)
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image

Comparison of Statistics

In [7]:
# Back-fill download once more, this time with n_trading_days=365, reduced
# straight to simple returns.
returns_bfill_365 = (
    qf.get_data(
        tickers,
        start=qf.DEFAULT_TRAIN_START,
        end=qf.DEFAULT_TRAIN_END,
        indicators="Close",
        imputation_method="bfill",
        n_trading_days=365,
    )
    .pct_change()
    .dropna()
)

# log(1 + simple return), applied to the whole frame.
log_returns_bfill_365 = np.log(1 + returns_bfill_365).dropna()

# Histogram comparison of the three log-return variants (2-column grid, 50 bins).
qf.plot_hist_grid_compare(
    [log_returns, log_returns_bfill, log_returns_bfill_365],
    data_names=["Shrinkage (252)", "Back-Fill (252)", "Back-Fill (365)"],
    n_cols=2,
    bins=50,
    figsize=(8, 3),
    x_name="log return",
    y_name="probability density",
)
No description has been provided for this image
In [8]:
# CDF comparison of the same three log-return variants (2-column grid).
qf.plot_cdf_grid_compare(
    [log_returns, log_returns_bfill, log_returns_bfill_365],
    data_names=["Shrinkage (252)", "Back-Fill (252)", "Back-Fill (365)"],
    n_cols=2,
    figsize=(8, 3),
    x_name="log rate of return",
    y_name="cumulative probability",
)
No description has been provided for this image
In [15]:
# Calculate and compare the statistics of the returns
def calculate_statistics(returns):
    """Collect mean, std, skew and kurtosis of ``returns`` in a dict.

    ``returns`` must provide ``mean()``, ``std()``, ``skew()`` and
    ``kurtosis()`` methods (e.g. a pandas DataFrame or Series). Each method is
    called without arguments and its result is stored under the method's name,
    in the order mean, std, skew, kurtosis.
    """
    statistic_names = ('mean', 'std', 'skew', 'kurtosis')
    return {name: getattr(returns, name)() for name in statistic_names}

import pandas as pd

# Summary statistics of the log returns for both imputation methods.
stats = calculate_statistics(log_returns)
stats_bfill = calculate_statistics(log_returns_bfill)

# Each dict maps a statistic name to a Series indexed by the columns of the
# returns frame. Building a frame from it and transposing gives one row per
# statistic and one column per returns column.
stats_df = pd.DataFrame(stats).T
stats_bfill_df = pd.DataFrame(stats_bfill).T

# Signed and absolute difference: shrinkage minus back-fill.
error = stats_df - stats_bfill_df
abs_error = error.abs()

# Relative error for the standard deviation only, measured against the
# shrinkage value.
rel_error_std = abs_error.loc['std'] / stats_df.loc['std']
In [16]:
# Render the absolute errors as a stack of tables, a few columns per table so
# that each one stays readable.
fig = plt.figure(figsize=(20, 10))  # tall enough for several stacked tables

# A figure-level title: plt.title() at this point would attach the text to an
# implicit full-figure axes instead of the figure. The statistics were computed
# from the log returns, hence "log rate of return".
fig.suptitle('Absolute Errors of the first 4 Moments of the log rate of return between Shrinkage and Back-Fill Methods on 252 Trading Days')

# Split the columns into chunks of `chunk_size` columns each (ceil division).
n_cols = len(abs_error.columns)
chunk_size = 4
n_chunks = (n_cols + chunk_size - 1) // chunk_size

for i in range(n_chunks):
    start_col = i * chunk_size
    end_col = min((i + 1) * chunk_size, n_cols)

    # One subplot per chunk, used purely as a canvas for the table.
    ax = fig.add_subplot(n_chunks, 1, i + 1)
    ax.table(cellText=abs_error.iloc[:, start_col:end_col].round(6).values,
             rowLabels=abs_error.index,
             colLabels=abs_error.columns[start_col:end_col],
             loc='center',
             cellLoc='center')
    ax.axis('off')  # hide ticks and frame; only the table should be visible

fig.tight_layout()
plt.show()
No description has been provided for this image
In [25]:
def compare_statistics(stats_df, stats_bfill_df):
    """Compare two sets of summary statistics and grade every difference.

    Only the ``'mean'`` and ``'std'`` rows of both frames are read; the columns
    identify the series being compared. ``stats_df`` acts as the reference: it
    is the denominator of the relative error and the first distribution in the
    KL term.

    Returns a DataFrame with one row per compared series and these columns:

    - ``abs_mean_error`` / ``abs_std_error``: absolute difference of mean / std.
    - ``rel_std_error``: ``abs_std_error`` divided by the reference std; a
      reference std of exactly 0 is replaced by NaN before dividing.
    - ``rmse_stats``: square root of the summed squared mean and std
      differences.
    - ``cohens_d``: signed mean difference divided by the pooled std.
    - ``kl_divergence``: closed-form KL divergence between two normal
      distributions parameterised by the two (mean, std) pairs.
    - ``<metric>_bewertung``: grade of each metric, based on its absolute value:
      ``'Vertretbar'`` (acceptable), ``'Grenzwertig'`` (borderline) or
      ``'Kritisch'`` (critical).
    - ``gesamt_bewertung``: the worst grade found in the row.

    A NaN metric is graded ``'Kritisch'`` because both threshold comparisons
    evaluate to false.
    """
    import numpy as np
    import pandas as pd

    # metric -> (largest acceptable magnitude, largest borderline magnitude)
    thresholds = {
        'abs_mean_error': (0.0005, 0.001),
        'abs_std_error': (0.001, 0.005),
        'rel_std_error': (0.02, 0.05),
        'rmse_stats': (0.002, 0.005),
        'cohens_d': (0.2, 0.5),
        'kl_divergence': (0.005, 0.01)
    }
    # Ordering of the grades, used to pick the worst one per row.
    severity = {'Vertretbar': 0, 'Grenzwertig': 1, 'Kritisch': 2}

    def grade(metric_name, value):
        """Grade one metric value by its magnitude against the thresholds."""
        acceptable_up_to, borderline_up_to = thresholds[metric_name]
        magnitude = abs(value)
        if magnitude <= acceptable_up_to:
            return 'Vertretbar'
        if magnitude <= borderline_up_to:
            return 'Grenzwertig'
        return 'Kritisch'

    ref_mean, alt_mean = stats_df.loc['mean'], stats_bfill_df.loc['mean']
    ref_std, alt_std = stats_df.loc['std'], stats_bfill_df.loc['std']

    mean_gap = ref_mean - alt_mean
    std_gap = ref_std - alt_std
    abs_std_gap = std_gap.abs()

    # Result frame: one column per metric.
    result = pd.DataFrame({
        'abs_mean_error': mean_gap.abs(),
        'abs_std_error': abs_std_gap,
        'rel_std_error': abs_std_gap / ref_std.replace(0, np.nan),
        'rmse_stats': np.sqrt(mean_gap ** 2 + std_gap ** 2),
        'cohens_d': mean_gap / np.sqrt((ref_std ** 2 + alt_std ** 2) / 2),
        'kl_divergence': (
            np.log(alt_std / ref_std)
            + (ref_std ** 2 + mean_gap ** 2) / (2 * alt_std ** 2)
            - 0.5
        )
    })

    # Individual grades: snapshot the metric names first, because the loop
    # adds new columns to `result`.
    metric_names = list(result.columns)
    for name in metric_names:
        result[f'{name}_bewertung'] = result[name].apply(
            lambda value, name=name: grade(name, value)
        )

    grade_columns = [f'{name}_bewertung' for name in metric_names]

    def worst_grade(row):
        """Overall grade of a row: the most severe of its individual grades."""
        return max(row[grade_columns], key=severity.get)

    result['gesamt_bewertung'] = result.apply(worst_grade, axis=1)
    return result
In [27]:
# Grade the differences between the shrinkage statistics (reference) and the
# back-fill statistics.
result_df = compare_statistics(stats_df, stats_bfill_df)
# 24 equals the column count reported for `data` earlier in the notebook, so
# this is expected to display every row — confirm if the ticker list changes.
result_df.head(24)
Out[27]:
abs_mean_error abs_std_error rel_std_error rmse_stats cohens_d kl_divergence abs_mean_error_bewertung abs_std_error_bewertung rel_std_error_bewertung rmse_stats_bewertung cohens_d_bewertung kl_divergence_bewertung gesamt_bewertung
Ticker Price
DIS Close 0.000299 0.000066 0.003445 0.000306 0.015543 0.000132 Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar
VZ Close 0.000143 0.001579 0.108216 0.001586 0.009264 0.009909 Vertretbar Grenzwertig Kritisch Vertretbar Vertretbar Grenzwertig Kritisch
HON Close 0.000026 0.001112 0.058922 0.001113 -0.001327 0.003157 Vertretbar Grenzwertig Kritisch Vertretbar Vertretbar Vertretbar Kritisch
IBM Close 0.000108 0.003364 0.231695 0.003365 -0.006628 0.037992 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
JNJ Close 0.000098 0.002917 0.266492 0.002919 -0.007871 0.047996 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
AMZN Close 0.000066 0.008075 0.309676 0.008076 0.002169 0.061284 Vertretbar Kritisch Kritisch Kritisch Vertretbar Kritisch Kritisch
CSCO Close 0.000820 0.006986 0.340860 0.007034 -0.033839 0.071858 Grenzwertig Kritisch Kritisch Kritisch Vertretbar Kritisch Kritisch
GS Close 0.000047 0.007772 0.280627 0.007772 -0.001962 0.136816 Vertretbar Kritisch Kritisch Kritisch Vertretbar Kritisch Kritisch
MRK Close 0.000034 0.000017 0.000946 0.000038 -0.001926 0.000003 Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar
SHW Close 0.000398 0.000219 0.012325 0.000455 0.022290 0.000394 Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar Vertretbar
CAT Close 0.000257 0.002298 0.100855 0.002312 -0.011869 0.012226 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
TRV Close 0.000161 0.002823 0.136536 0.002827 0.008326 0.023865 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
MCD Close 0.000007 0.003591 0.297774 0.003591 0.000493 0.057524 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
AAPL Close 0.000355 0.009099 0.432445 0.009106 0.013677 0.103129 Vertretbar Kritisch Kritisch Kritisch Vertretbar Kritisch Kritisch
KO Close 0.000106 0.001770 0.137598 0.001774 -0.007709 0.015306 Vertretbar Grenzwertig Kritisch Vertretbar Vertretbar Kritisch Kritisch
UNH Close 0.000243 0.000782 0.032717 0.000819 -0.010007 0.001063 Vertretbar Vertretbar Grenzwertig Vertretbar Vertretbar Vertretbar Grenzwertig
NKE Close 0.000057 0.001899 0.098577 0.001900 0.002806 0.008314 Vertretbar Grenzwertig Kritisch Vertretbar Vertretbar Grenzwertig Kritisch
BA Close 0.000064 0.000520 0.026320 0.000524 0.003310 0.000730 Vertretbar Vertretbar Grenzwertig Vertretbar Vertretbar Vertretbar Grenzwertig
CVX Close 0.000118 0.002786 0.152803 0.002789 -0.006996 0.030836 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
MSFT Close 0.000357 0.002153 0.115625 0.002182 -0.018079 0.011292 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
NVDA Close 0.000304 0.001809 0.057766 0.001834 -0.009418 0.003081 Vertretbar Grenzwertig Kritisch Vertretbar Vertretbar Vertretbar Kritisch
PG Close 0.000168 0.002732 0.228518 0.002737 -0.012583 0.037163 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
WMT Close 0.000073 0.004724 0.386959 0.004724 -0.004947 0.087045 Vertretbar Grenzwertig Kritisch Grenzwertig Vertretbar Kritisch Kritisch
V Close 0.000675 0.010325 0.478925 0.010347 0.039257 0.691435 Grenzwertig Kritisch Kritisch Kritisch Vertretbar Kritisch Kritisch